<!--
   Copyright 2024 The Google Research Authors.
  
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
  
       http://www.apache.org/licenses/LICENSE-2.0
  
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
-->

<!DOCTYPE html>
<meta charset="utf-8">
<title>MarotUtils tests</title>
<html lang="en">

<link rel="stylesheet" type="text/css" href="marot.css"/>
<script src="marot-utils.js"></script>

<style>
  /* Cells of the rendered output row (the <tr> with class "marot-test-row"). */
  .marot-test-row td {
    border: 1px solid blue;
  }
  /* The table holding the selectors, the textareas and the rendered row. */
  .marot-test-table {
    font-family: monospace;
    font-size: 16px;
  }
  .marot-test-table td {
    box-sizing: border-box;
    padding: 8px;
    width: 40%;
    vertical-align: top;
  }
  /* The editable source/target text boxes. */
  .marot-textarea {
    box-sizing: border-box;
    color: gray;
    width: 100%;
    height: 300px;
  }
  /*
   * renderText() gives each subpara span the class "subpara-" + (index % 2), so
   * neighbouring subparas alternate between the two outline colors below.
   */
  .subpara-0 {
    outline: 1px solid lightblue;
  }
  .subpara-0:hover {
    background: whitesmoke;
    outline: 1px solid blue;
  }
  .subpara-1 {
    outline: 1px solid lightgreen;
  }
  .subpara-1:hover {
    background: whitesmoke;
    outline: 1px solid green;
  }
  /*
   * The "-highlighted" classes are toggled from showManualTests(): the class name is
   * the element id with its trailing "-<number>" replaced by "-highlighted".
   */
  .src-subpara-highlighted {
    background: lavender;
  }
  .tgt-subpara-highlighted {
    background: lavender;
  }
  .tgt-sentence-highlighted {
    font-weight: bold;
  }
  .tgt-token-highlighted {
    text-decoration: underline;
  }
</style>

<script>
  /**
   * Returns HTML for a segment, with each paragraph, each subpara, each sentence and each token
   * getting its own id (`${idPrefix}-paragraph-N`, `${idPrefix}-subpara-N`,
   * `${idPrefix}-sentence-N`, `${idPrefix}-token-N`) and a matching class
   * (`${idPrefix}-paragraph`, `${idPrefix}-subpara`, ...).
   *
   * Token text is inserted as-is (it is not HTML-escaped).
   *
   * @param {string} idPrefix Prefix used for all generated ids and classes.
   * @param {!Array<string>} tokens All tokens of the segment.
   * @param {!Array<!Object>} sentences Sentence descriptors; `offset`, `num_tokens`,
   *     `ends_with_line_break` and `ends_with_para_break` are read from each.
   * @param {!Array<!Array<number>>} subparas Each subpara is an array of indices into
   *     `sentences`.
   * @return {string}
   */
  function renderText(idPrefix, tokens, sentences, subparas) {
    // Every paragraph, including those opened after a paragraph break, carries the same class.
    const openParagraph = (index) =>
        `<p class="${idPrefix}-paragraph" id="${idPrefix}-paragraph-${index}">`;

    let paragraphIndex = 0;
    let rendered = openParagraph(paragraphIndex);
    for (let p = 0; p < subparas.length; p++) {
      const subpara = subparas[p];
      rendered +=
        `<span id="${idPrefix}-subpara-${p}" class="${idPrefix}-subpara subpara-${p % 2}">`;
      let subparaSpanClosed = false;
      for (const s of subpara) {
        const sentence = sentences[s];
        rendered += `<span class="${idPrefix}-sentence" id="${idPrefix}-sentence-${s}">`;
        rendered += tokens
            .slice(sentence.offset, sentence.offset + sentence.num_tokens)
            .map((token, t) =>
                `<span class="${idPrefix}-token" id="${idPrefix}-token-${t + sentence.offset}">` +
                token + '</span>')
            .join('');
        rendered += '</span>';
        if (sentence.ends_with_line_break) {
          rendered += '<br>';
        } else if (sentence.ends_with_para_break) {
          // A paragraph break closes the current subpara span and paragraph, then opens the
          // next paragraph. NOTE(review): this presumes a paragraph break only occurs on the
          // last sentence of a subpara -- confirm against MarotUtils.makeSubparas.
          paragraphIndex++;
          rendered += '</span></p>';
          rendered += openParagraph(paragraphIndex);
          subparaSpanClosed = true;
        }
      }
      if (!subparaSpanClosed) {
        rendered += '</span>';
      }
    }
    rendered += '</p>';
    return rendered;
  }

  /**
   * Checks the result of MarotUtils.makeSubparas() for the given sentences and limits, then
   * checks that aligning the resulting structure with itself maps every token back to its own
   * subpara. All checks use console.assert, so failures are reported on the console.
   * @param {string} id Label appended to assertion output to identify the configuration.
   * @param {!Array<string>} tokens Tokens of the segment (not read by this function).
   * @param {!Array<!Object>} sentences Sentence descriptors; `num_tokens` is read from each.
   * @param {number} maxSentences
   * @param {number} maxTokens
   */
  function testMakeSubparas(id, tokens, sentences, maxSentences, maxTokens) {
    const subparas = MarotUtils.makeSubparas(sentences, maxSentences, maxTokens);

    // Subparas must be non-empty, respect the limits, and cover sentences 0, 1, 2, ... in order.
    let nextSentence = 0;
    for (let i = 0; i < subparas.length; i++) {
      const subpara = subparas[i];
      console.assert(subpara.length > 0, subpara);
      console.assert(subpara.length <= maxSentences, subpara.length, id);
      console.assert(subpara[0] == nextSentence, subpara[0], nextSentence, id);
      let tokenCount = 0;
      for (let j = 0; j < subpara.length; j++) {
        const sentenceIndex = subpara[j];
        console.assert(sentenceIndex < sentences.length, sentenceIndex, sentences.length, id);
        console.assert(sentenceIndex == nextSentence, sentenceIndex, nextSentence, id);
        nextSentence += 1;
        tokenCount += sentences[sentenceIndex].num_tokens;
      }
      // The token limit is only checked when a subpara has more than one sentence.
      if (subpara.length > 1) {
        console.assert(tokenCount <= maxTokens, tokenCount, id);
      }
    }

    // Aligning a structure with itself should be the identity mapping on subparas.
    const structure = MarotAligner.getAlignmentStructure(sentences, subparas);
    const selfAlignment = new MarotAligner(structure, structure);
    let token = 0;
    while (token < structure.num_tokens) {
      const ownSubpara = MarotUtils.binSearch(structure.subparaTokens, token + 1);
      console.assert(
          ownSubpara >= 0 && ownSubpara < subparas.length, token, ownSubpara, subparas, id);
      const alignedSubpara = selfAlignment.tgtTokenToSrcSubpara(token);
      console.assert(ownSubpara == alignedSubpara, ownSubpara, alignedSubpara, id);
      token++;
    }
  }

  /**
   * Canned text segments. runAutomatedTests() runs over every entry, and setTestSegment()
   * copies the entry whose array index matches the value chosen in a "Copy from
   * Test-Segment-N" selector into the source or target textarea.
   *
   * The entries range from the empty string and a few short sentences, to long token runs
   * and paragraph breaks, a case that (per its own text) used to produce needlessly small
   * one-sentence subparas, and text with poem-style line breaks.
   * @const {!Array<string>}
   */
  const testTextSegments = [
    '',
    'One-1.',
    'One-1. Two-2.',
    'One-1. Two-2. Three-3. Four-4. Five-5.',
    'One-1. Two-2. Three-3. Four-4. Five-5. Six-6. Seven-7. Eight-8. Nine-9. Ten-10.',
    `Lot of tokens and two paras: 0 1 2 3 4 5 6 7 8 9 10
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30

    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
    11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30.`,
    `A realistic example. Should we write long test chunks with no paragraph breaks?
    A study published in the Journal of Made Up Research claims that yes, we should.
    The study suggests that it is extremely important to drink coffee while reading sesquipedalian words and long-winding sentences.
    The End!
    OK, not quite the end, there is more to be said.
    The extra bits are as follows:
    Bit1. Bit2. Bit3.
    PS:

    This is a second paragraph already specified.`,
    `A test for the case that used to create unnecessarily small 1-sentence subparas at the end.
     When max sentences is 2 or more and max tokens is such that 3 sentences would be too many.
     Something like 80 tokens would trigger the behavior.
     #4: A short sentence.
     #5: A short sentence.
     #6: A short sentence.
     #7: A short sentence.
     #8: A short sentence.
     #9: A short sentence.
     #10: A short sentence.
     #11: A short sentence.
     #12: A short sentence.
     #13: A m a n y t o k e n e d s e n t e n c e not combinable with the n e x t sent at 80 tokens.
     #14: 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3.
     #15: A short sentence.
     #16: A short sentence.
     #17: A short sentence.
     `,
     `A poem preceded by a paragraph.

     ’Twas brillig, and the slithy toves
      Did gyre and gimble in the wabe:
      All mimsy were the borogoves,
      And the mome raths outgrabe.

      The next para. This one preceded by just a line break:
      “Beware the Jabberwock, my son!
      The jaws that bite, the claws that catch!
      Beware the Jubjub bird, and shun
      The frumious Bandersnatch!”
      This sentence has no line-break. But this one does.

      Another poem. After three sentences. The third of the sentences.
      A poem must not mean
      But be. 
      `,
  ];

  /**
   * Runs all the automated tests: every canned segment is tokenized once and then checked
   * with testMakeSubparas(), first with effectively unlimited limits and then with every
   * combination of 8..1 max sentences and a fixed list of max-token values.
   */
  function runAutomatedTests() {
    const tokenLimits = [1, 5, 80, 200, 400];
    testTextSegments.forEach((text, index) => {
      const idPrefix = `Seg-${index}-`;
      const {tokens, sentence_splits: sentences} = MarotUtils.tokenizeText(text);
      testMakeSubparas(
          `${idPrefix}MAX`, tokens, sentences,
          Number.MAX_SAFE_INTEGER, Number.MAX_SAFE_INTEGER);
      for (let maxSentences = 8; maxSentences > 0; maxSentences--) {
        for (const maxTokens of tokenLimits) {
          testMakeSubparas(
              `${idPrefix}${maxSentences}-${maxTokens}`,
              tokens, sentences, maxSentences, maxTokens);
        }
      }
    });
  }

  /**
   * Shows tables for manual testing.
   *
   * Reads the max-sentences / max-tokens limits and the source/target text from the page,
   * renders both segments (via renderText) into the 'marot-test-row' table row, aligns them
   * with MarotAligner, and installs mouseover/mouseout handlers that add/remove the
   * "<kind>-highlighted" classes on the hovered element and on the elements mapped to it.
   */
  function showManualTests() {
    /**
     * Reads a positive base-10 integer from the input with the given id. Anything that does
     * not parse to a positive integer yields `fallback`. The value used is written back to
     * the input so that the page shows what was actually applied.
     */
    const readPositiveInt = (inputId, fallback) => {
      const input = document.getElementById(inputId);
      const parsed = Number.parseInt(input.value, 10);
      const value = (Number.isNaN(parsed) || parsed <= 0) ? fallback : parsed;
      input.value = value;
      return value;
    };
    const maxSentences = readPositiveInt('marot-test-max-sentences', 3);
    const maxTokens = readPositiveInt('marot-test-max-tokens', 100);

    const segs = [
      document.getElementById('marot-test-src').value,
      document.getElementById('marot-test-tgt').value,
    ];
    let html = '';
    const structures = [];
    for (const index of [0, 1]) {
      const idPrefix = (index === 0) ? 'src' : 'tgt';
      const tokenization = MarotUtils.tokenizeText(segs[index]);
      const tokens = tokenization.tokens;
      const sentences = tokenization.sentence_splits;
      const subparas = MarotUtils.makeSubparas(sentences, maxSentences, maxTokens);
      html += `
        <td>
          ${renderText(idPrefix, tokens, sentences, subparas)}
        </td>`;
      const structure = MarotAligner.getAlignmentStructure(sentences, subparas);
      // Kept on the structure so that the token loop below can iterate over target tokens.
      structure.tokens = tokens;
      structures.push(structure);
    }
    document.getElementById('marot-test-row').innerHTML = html;
    const [srcStructure, tgtStructure] = structures;
    const aligner = new MarotAligner(srcStructure, tgtStructure);

    /** Maps an element id to the set of ids that get highlighted together with it. */
    const idMap = new Map();
    const addToMap = (x, y) => {
      if (!idMap.has(x)) {
        idMap.set(x, new Set());
      }
      idMap.get(x).add(y);
    };
    /** Adds or removes the "-highlighted" class derived from the element's own id. */
    const eltHighlighter = (id, add) => {
      const elt = document.getElementById(id);
      console.assert(elt, id);
      if (!elt) {
        // Already reported by the assertion; do not let a hover handler throw.
        return;
      }
      const lastDash = id.lastIndexOf('-');
      console.assert(lastDash >= 0, id);
      // E.g. 'tgt-token-12' -> 'tgt-token-highlighted'.
      const cls = id.substring(0, lastDash) + '-highlighted';
      elt.classList.toggle(cls, add);
    };
    const mapHighlighter = (id, add) => {
      eltHighlighter(id, add);
      for (const mid of idMap.get(id) ?? []) {
        eltHighlighter(mid, add);
      }
      return true;
    };
    const activateId = (id) => {
      const elt = document.getElementById(id);
      console.assert(elt, id);
      if (!elt) {
        return;
      }
      elt.addEventListener('mouseover', () => mapHighlighter(id, true));
      elt.addEventListener('mouseout', () => mapHighlighter(id, false));
    };

    // Source subpara -> target subparas aligned to it.
    for (let tp = 0; tp < tgtStructure.subparas.length; tp++) {
      const range = aligner.tgtSubparaToSrcSubparaRange(tp);
      const tgtId = 'tgt-subpara-' + tp;
      for (let sp = range[0]; sp <= range[1]; sp++) {
        addToMap('src-subpara-' + sp, tgtId);
      }
      activateId(tgtId);
    }
    // Source subpara -> target sentences aligned to it.
    for (let ts = 0; ts < tgtStructure.sentences.length; ts++) {
      const range = aligner.tgtSentenceToSrcSubparaRange(ts);
      const tgtId = 'tgt-sentence-' + ts;
      for (let sp = range[0]; sp <= range[1]; sp++) {
        addToMap('src-subpara-' + sp, tgtId);
      }
      activateId(tgtId);
    }
    // Source subpara <-> target tokens (the only mapping that is registered in both directions).
    for (let tt = 0; tt < tgtStructure.tokens.length; tt++) {
      const sp = aligner.tgtTokenToSrcSubpara(tt);
      const tgtId = 'tgt-token-' + tt;
      const srcId = 'src-subpara-' + sp;
      addToMap(srcId, tgtId);
      addToMap(tgtId, srcId);
      activateId(tgtId);
    }
    for (let sp = 0; sp < srcStructure.subparas.length; sp++) {
      activateId('src-subpara-' + sp);
    }
  }

  /**
   * Copies the canned text segment chosen in selector 'marot-test-seg-<seg>' into the source
   * textarea (seg 1) or the target textarea (any other seg).
   *
   * If the selector does not currently hold a valid index into testTextSegments (for
   * example right after the reset below), the textarea is left untouched.
   * @param {number} seg
   */
  function setTestSegment(seg) {
    // Number() keeps the loose `seg == 1` matching of numeric strings.
    const textareaId = (Number(seg) === 1) ? 'marot-test-src' : 'marot-test-tgt';
    const textarea = document.getElementById(textareaId);
    const selector = document.getElementById(`marot-test-seg-${seg}`);
    const text = testTextSegments[selector.value];
    if (typeof text === 'string') {
      textarea.value = text;
    }
    /**
     * Reset, so that a 'change' event will be triggered even if the same segment is picked again
     * (as the user may have manually edited its text).
     */
    selector.value = -1;
  }
</script>

<body>
  <div id="marot-test-status">
    <b style="color:red">Running automated tests...</b>
  </div>

  <h2>Subpara-splitting and alignment testing</h2>

  <p>
    <b>Subparas max limits:</b>
    Sentences:
      <input onchange="showManualTests()"
          type="text" size="4" id="marot-test-max-sentences" value="3"/>
    Tokens:
      <input onchange="showManualTests()"
          type="text" size="6" id="marot-test-max-tokens" value="100"/>
  </p>

  <p>
    <i>
      Split both text segments into tokens, sentences, subparas, and align them.
      You can manually edit/replace the text.
    </i>
  </p>

  <table class="marot-test-table" id="marot-test-table">
    <tr>
      <td>
        <select oninput="setTestSegment(1); showManualTests()" id="marot-test-seg-1">
          <option value="0">Copy from Test-Segment-0</option>
          <option value="1">Copy from Test-Segment-1</option>
          <option value="2">Copy from Test-Segment-2</option>
          <option value="3">Copy from Test-Segment-3</option>
          <option value="4">Copy from Test-Segment-4</option>
          <option value="5">Copy from Test-Segment-5</option>
          <option value="6">Copy from Test-Segment-6</option>
          <option value="7">Copy from Test-Segment-7</option>
          <option value="8" selected>Copy from Test-Segment-8</option>
        </select>
      </td>
      <td>
        <select oninput="setTestSegment(2); showManualTests()" id="marot-test-seg-2">
          <option value="0">Copy from Test-Segment-0</option>
          <option value="1">Copy from Test-Segment-1</option>
          <option value="2">Copy from Test-Segment-2</option>
          <option value="3">Copy from Test-Segment-3</option>
          <option value="4">Copy from Test-Segment-4</option>
          <option value="5">Copy from Test-Segment-5</option>
          <option value="6" selected>Copy from Test-Segment-6</option>
          <option value="7">Copy from Test-Segment-7</option>
          <option value="8">Copy from Test-Segment-8</option>
        </select>
      </td>
    </tr>
    <tr class="marot-test-src-tgt">
      <td>
        <textarea id="marot-test-src" oninput="showManualTests()"
            class="marot-textarea">
        </textarea>
      </td>
      <td>
        <textarea id="marot-test-tgt" oninput="showManualTests()"
            class="marot-textarea">
        </textarea>
      </td>
    </tr>
    <tr id="marot-test-row" class="marot-test-row">
    </tr>
  </table>

  <p>
    Note that alignments are built upon a base of "target-token-to-source-subpara"
    alignment. If you hover over a source subpara, then:
    <i>
      <ul>
        <li>Each target token that aligns to it gets underlined.</li>
        <li>Each target sentence that aligns to it gets bolded.</li>
        <li>Each target subpara that aligns to it gets highlighted.</li>
      </ul>
    </i>
    If you hover over a target token, then:
    <i>
      <ul>
        <li>The target token gets underlined.</li>
        <li>The enclosing target sentence gets bolded.</li>
        <li>The enclosing target subpara gets highlighted.</li>
        <li>The aligned source subpara for only the target token gets highlighted.</li>
      </ul>
    </i>
  </p>

  <script>
    // Run the console.assert-based checks first, then flip the status banner to "finished".
    runAutomatedTests();
    document.getElementById('marot-test-status').innerHTML = '<b style="color:green">Finished running automated tests.</b>';
    // Fill both textareas from the segments pre-selected in the drop-downs, then render.
    for (const seg of [1, 2]) {
      setTestSegment(seg);
    }
    showManualTests();
  </script>

</body>
</html>

